In this Colab notebook we will focus on data mining, exploring, and applying natural language processing (NLP) techniques to three datasets that contain information from job listings scraped from Glassdoor. The datasets are available in Kaggle at the following links:
Data Scientist Job Postings (n = 3900) (link)
Data Analyst Job Postings (n = 2000) (link)
Business Analyst Job Postings (n = 4000) (link)
The structure of the three datasets is similar, with some additional features in the business analyst one that will be excluded from the final analysis.
The task is to explore the datasets and extract insights about the differences or similarities between job listings in three highly sought-after fields in the Data Science market ecosystem.
import os
# --- Colab environment setup: Spark NLP requires a JVM, so install Java 8 ---
# Install Java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# Point JAVA_HOME at the freshly installed JDK and put its binaries on PATH
# so PySpark can locate the JVM.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version
# Install PySpark (pinned to 2.4.4 -- presumably for Spark NLP compatibility; confirm)
! pip install --ignore-installed -q pyspark==2.4.4
# Install Spark NLP
! pip install --ignore-installed -q spark-nlp
# Install Top2Vec plus the optional sentence-encoder embedding backends
! pip install top2vec
! pip install top2vec[sentence_encoders]
# Install BERTopic plus its optional visualization extras
! pip install bertopic
! pip install bertopic[visualization]
from google.colab import drive
import pandas as pd
import numpy as np
import re
import json
import copy
import pickle
from scipy.cluster.hierarchy import linkage, dendrogram
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from top2vec import Top2Vec
from bertopic import BERTopic
# importing visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
# importing scikitlearn modules
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import TruncatedSVD, NMF, PCA, LatentDirichletAllocation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, DBSCAN
from sklearn.manifold import TSNE
from sklearn.feature_extraction import text
from sklearn.preprocessing import normalize, LabelEncoder
# importing PySpark
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.clustering import LDA
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# importing Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.embeddings import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
# download the NLTK stopword corpus (used later to build a custom stop-word list)
nltk.download('stopwords')
# shared Google Drive locations: input data lives under root_path,
# generated figures are written to output_path
root_path = '/content/drive/Shareddrives/Data Mining Group Project/'
output_path = '/content/drive/Shareddrives/Data Mining Group Project/presentation/'
# drive.flush_and_unmount()
drive.mount('/content/drive')
# for GPU training >> sparknlp.start(gpu=True)
spark = sparknlp.start(gpu = False)
print("Apache Spark version:", spark.version)
print("Spark NLP version", sparknlp.version())
# reading in data for each job type
da_data = pd.read_csv(root_path + 'data/DataAnalyst.csv')
ds_data = pd.read_csv(root_path + 'data/DataScientist.csv')
ba_data = pd.read_csv(root_path + 'data/BusinessAnalyst.csv')
# quick structural inspection of each dataset
print(da_data.info())
da_data.head(3)
print(ds_data.info())
ds_data.head(3)
print(ba_data.info())
ba_data.head(3)
# first two columns of business analyst dataset are type object whereas
# other datasets are of type int, suggesting there might be text values.
# searching for rows with text shows that columns for business dataset are
# shifted to the left by 2 (last two columns as NaNs)
ba_data[ba_data['Unnamed: 0'].str.contains('[a-zA-Z]', regex=True)].tail(3)
# finding indices of rows that need to be shifted back and correcting:
# for those rows, copy everything except the last two columns two slots
# to the right, realigning values with the proper headers
shifted_rows = ba_data.index[ba_data['Unnamed: 0'].str.contains('[a-zA-Z]+')]
ba_data.iloc[shifted_rows, 2:] = ba_data.iloc[shifted_rows, 0:-2].values
# dropping beginning (index/artifact) columns after correction
ba_data = ba_data.iloc[:, 2:]
# beginning columns for other two datasets can be dropped as well
da_data = da_data.iloc[:, 1:]
ds_data = ds_data.iloc[:, 2:]
# printing out columns to inspect alignment
# notice that business analyst data types differ due to originally
# shifted data
print(da_data.info())
print('\n')
print(ds_data.info())
print('\n')
print(ba_data.info())
# making sure shift worked
ba_data.tail(3)
# creating a new column in each dataset indicating job type
da_data['Job Type'] = 'Data Analyst'
ba_data['Job Type'] = 'Business Analyst'
ds_data['Job Type'] = 'Data Scientist'
# business analyst dataset has all columns as type obj, which will result
# in combined dataframe to type cast all columns to object type
# we want appropriate data types so we create a data type dictionary by
# using data analyst types (could also use data scientist types)
type_dict = dict(zip(da_data.columns, da_data.dtypes.tolist()))
# combining all three datasets into one to continue data cleanup
# as well as converting dtypes after concatenation
data = pd.concat([da_data, ba_data, ds_data], ignore_index=True).astype(type_dict)
# cleaning up column names so data is easier to work with:
# title-case and strip spaces, e.g. 'Job Type' -> 'JobType'
# (all three frames share the same columns, so da_data's header works here)
data.columns = da_data.columns.str.title().str.replace(' ', '')
# displaying
data.info()
# will be using kde to estimate values
# want to find the distribution of company founded years
# to do this we drop missing values (encoded as -1) so they don't skew distribution
training_years = data[data['Founded'] != -1]['Founded'].to_numpy().reshape(-1, 1)
# number of missing values and their indexes
missing_idx = data[data['Founded'] == -1].index
n_missing = len(missing_idx)
# # using grid search to find the optimal bandwith for kde
# # to skip wait, use bandwidth = 1.85
# bandwidth = np.linspace(0.01, 2.00, 50)
# bandwidth 1.85 is the value previously selected by the (commented) grid search
kde = KernelDensity(kernel='gaussian', bandwidth=1.85)
kde.fit(training_years) # comment this line out when doing grid search
# grid = GridSearchCV(kde, {'bandwidth': bandwidth})
# grid.fit(training_years)
# # updating kde object to now use the estimator determined through grid search
# kde = grid.best_estimator_
# using optimized kde to generate random samples (seeded for reproducibility)
kde_sample = kde.sample(n_samples=n_missing, random_state=1)
# function to scale values into a predefined range
# needed because KDE can produce values outside of observed values
# (for example, we could otherwise see 2023 as a founding year)
def rescale(value, s_min, s_max, t_min, t_max):
    """Linearly map ``value`` from the source range [s_min, s_max]
    onto the target range [t_min, t_max].

    Parameters are plain numerics; returns a float (or t_min itself when
    the source range is degenerate).

    Robustness fix: when ``s_min == s_max`` the original raised
    ZeroDivisionError; return ``t_min`` instead, since every source value
    collapses to a single point.
    """
    if s_max == s_min:
        return t_min
    return (value - s_min) / (s_max - s_min) * (t_max - t_min) + t_min
# rescaling generated values to make sure nothing falls outside of existing range
kde_sample_rescaled = np.array([round(rescale(year, kde_sample.min(), kde_sample.max(), training_years.min(), training_years.max())) for year in kde_sample.flatten()])
# replacing missing values with the random data points
data.loc[missing_idx, 'Founded'] = kde_sample_rescaled
# converting founded year to company age
# and then dropping original column
# NOTE(review): reference year 2021 is hard-coded (scrape date, presumably) -- confirm
data['OrganizationAge'] = 2021 - data['Founded']
data = data.drop(columns=['Founded'])
# want to find the distribution of ratings
# to do this we drop missing values (encoded as -1) so they don't skew distribution
training_ratings = data[data['Rating'] != -1]['Rating'].to_numpy().reshape(-1, 1)
# number of missing values and their indexes
missing_idx = data[data['Rating'] == -1].index
n_missing = len(missing_idx)
# # using grid search to find the optimal bandwith for kde
# # to skip wait, use bandwidth = 0.01
# bandwidth = np.linspace(0.01, 2.00, 50)
# bandwidth 0.01 is the value previously selected by the (commented) grid search
kde = KernelDensity(kernel='gaussian', bandwidth=0.01)
kde.fit(training_ratings)
# grid = GridSearchCV(kde, {'bandwidth': bandwidth})
# grid.fit(training_ratings)
# # updating kde object to now use the estimator determined through grid search
# kde = grid.best_estimator_
# using optimized kde to generate random samples (seeded for reproducibility)
kde_sample = kde.sample(n_samples=n_missing, random_state=1)
# rescaling generated values to make sure nothing falls outside of existing range
# using the same function as defined previously; ratings keep one decimal place
kde_sample_rescaled = np.array([round(rescale(rating, kde_sample.min(), kde_sample.max(), training_ratings.min(), training_ratings.max()), 1) for rating in kde_sample.flatten()])
# replacing missing values with the random data points
data.loc[missing_idx, 'Rating'] = kde_sample_rescaled
# cleaning up job description by replacing new line characters
data['JobDescription'] = data['JobDescription'].str.replace('\n', ' ', regex=True)
# company name column has some ratings in the name that are stragglers
# (a trailing newline followed by a rating like "3.9"); we want to remove these
data['CompanyName'] = data['CompanyName'].str.replace('\n\d\.\d$', '', regex=True)
# dropping columns that aren't needed for analysis
# revenue data has a majority of missing values
data = data.drop(columns=['EasyApply', 'Competitors', 'Revenue'])
# salary column contains both salaried and hourly compensation info
hourly_mask = data['SalaryEstimate'].str.contains('Per Hour')
# we can see salary values are strings from which we can extract float values
# (column index 1 is SalaryEstimate at this point in the cleanup)
print('Example of hourly syntax:', data[hourly_mask].iloc[0, 1])
print('Example of salary syntax:', data[~hourly_mask].iloc[0, 1])
# using a regex pattern with capture groups to extract numbers
# first group captures first value, second group for second value
salary_pattern = '\$*(\d+)[kK]*-\$*(\d+)[kK]*.*'
# creating a two-column df frame, one column for each capture group
# one column serves as lower bound, other column serves as upper bound
# (whitespace is stripped first so the pattern doesn't need to handle it)
salary_df = (data['SalaryEstimate']
             .str.replace('\s', '', regex=True)
             .str.extract(salary_pattern)
             .rename(columns={0: 'SalaryLower', 1: 'SalaryUpper'})
             .astype('float')
             )
# displaying df
salary_df.head(3)
# currently, salary bounds are either in units of $/hr or $k/year and
# we want to convert to annual rates
# hourly rates should be multiplied by work hours in a year (40 h * 52 wk = 2080)
# salaried rates should be multiplied by 1k
salary_df.loc[hourly_mask, :] = salary_df.loc[hourly_mask, :].values * 2_080
salary_df.loc[~hourly_mask, :] = salary_df.loc[~hourly_mask, :].values * 1_000
# midpoint of the bounds (floor division keeps it a whole dollar amount)
salary_df['SalaryAvg'] = (salary_df['SalaryLower'] + salary_df['SalaryUpper']) // 2
# combining the salary column with the existing dataset
# and dropping original columns
data = pd.concat([data, salary_df], axis=1).drop(columns=['SalaryEstimate'])
# displaying resulting df
data.head(3)
# creating categorical variable column for type of ownership
# ownership type can be reduced to fewer categories:
# 1 appears to mean "commercial business", 0 "non-business" (public sector /
# nonprofit / education) -- inferred from the IsBusiness column name; confirm
# value counts are omitted
ownership_map = {
    'Company - Private': 1,
    'Company - Public': 1,
    '-1': 1,  # missing ownership is lumped in with businesses
    'Nonprofit Organization': 0,
    'Subsidiary or Business Segment': 1,
    'Government': 0,
    'College / University': 0,
    'Unknown': 1,
    'Hospital': 0,
    'Contract': 1,
    'Other Organization': 0,
    'Private Practice / Firm': 1,
    'School / School District': 0,
    'Self-employed': 0,
    'Franchise': 1
}
data['IsBusiness'] = data['TypeOfOwnership'].map(ownership_map)
# dropping the column that won't be needed anymore
data = data.drop(columns=['TypeOfOwnership'])
# creating an ordinal categorical column for size of company (0 = smallest band)
size_map = {
    '1 to 50 employees': 0,
    '51 to 200 employees': 1,
    '201 to 500 employees': 2,
    '501 to 1000 employees': 3,
    '1001 to 5000 employees': 4,
    '5001 to 10000 employees': 5,
    '10000+ employees': 6,
    # BUG FIX: after astype(type_dict) the Size column holds strings, so the
    # missing marker is the string '-1' (the original int key -1 never matched);
    # also fixed the misspelled 'Unknwon' key -> 'Unknown'.
    # Unmapped values fall through to NaN either way, so downstream imputation
    # behaves the same, but the intent is now explicit.
    '-1': None,
    'Unknown': None
}
# replacing current value columns with values as defined by map
data['Size'] = data['Size'].map(size_map)
# we want to impute missing values and will do so by looking at the sampling
# distribution of existing values
# we need to grab the indices of the observations with missing values (to fill them)
# and also the number of missing values to generate a random sample of that size
missing_idx = data[data['Size'].isna()].index
n_missing = len(missing_idx)
# generating random values based on distribution of non-null values
# distribution is just a simple frequency value for each class
# important to sort value counts not by count but by their value (0 - 6)
# so we can use np.random.choice to assign element-wise probability values
size_counts = data[data['Size'].notna()]['Size'].value_counts(dropna=False).sort_index().values
size_freq = size_counts / size_counts.sum()
# NOTE(review): unseeded draw -- results differ run to run; seed if reproducibility matters
rand_sample = np.random.choice(range(0, 7), size=n_missing, p=size_freq)
# assigning random sizes to missing values
data.loc[missing_idx, 'Size'] = rand_sample
data.head(3)
# saving the cleaned dataset to the GDrive (checkpoint so cleanup can be skipped later)
data.to_csv(root_path + '/data/clean_data.csv')
# reading the cleaned dataset from GDrive
data = pd.read_csv(root_path + '/data/clean_data.csv')
# creating the Spark dataframe for JobTitle and JobDescription
# (only the two text columns are needed by the Spark NLP pipeline)
spark_df = spark.createDataFrame(data[['JobTitle', 'JobDescription']])
spark_df.show()
# --- Spark NLP preprocessing pipeline definition ---
# Spark NLP requires the input dataframe or column to be converted to document
documentAssemblerJobTitle = DocumentAssembler()\
    .setInputCol("JobTitle")\
    .setOutputCol('document')\
    .setCleanupMode('disabled')
documentAssemblerJobDescription = DocumentAssembler()\
    .setInputCol("JobDescription")\
    .setOutputCol('document')\
    .setCleanupMode('disabled')
# Split the document into sentences (pretrained deep-learning sentence detector)
sentencerDL = SentenceDetectorDLModel.pretrained(name = "sentence_detector_dl",
                                                 lang = "en")\
    .setInputCols(["document"])\
    .setOutputCol("sentence")
# Split sentence to tokens
tokenizer = Tokenizer()\
    .setInputCols(['sentence'])\
    .setOutputCol('token')\
    .setCaseSensitiveExceptions(False)
# Clean unwanted characters: strip anything that is not word/digit/whitespace,
# and lowercase everything
normalizer = Normalizer()\
    .setInputCols(["token"])\
    .setOutputCol("normalizedToken")\
    .setCleanupPatterns(["[^\w\d\s]"])\
    .setLowercase(True)
# Remove stopwords
stopwordsCleaner = StopWordsCleaner()\
    .setInputCols(["normalizedToken"])\
    .setOutputCol("cleanToken")\
    .setCaseSensitive(False)\
    .setLazyAnnotator(False)
# Apply spell checking (Norvig-style pretrained model)
spellChecker = NorvigSweetingModel.pretrained()\
    .setInputCols(["cleanToken"])\
    .setOutputCol("checkedToken")\
    .setLazyAnnotator(False)
# Stems tokens
stemmer = Stemmer()\
    .setInputCols(["checkedToken"])\
    .setOutputCol("stemToken")\
    .setLanguage('English')\
    .setLazyAnnotator(False)
# Lemmatizes tokens (both stems and lemmas are produced from checkedToken,
# in parallel branches of the pipeline)
lemmatizer = LemmatizerModel.pretrained(name = "lemma_antbnc", lang = "en")\
    .setInputCols(['checkedToken'])\
    .setOutputCol('lemmaToken')\
    .setLazyAnnotator(False)
# Assembles stemmed tokens back into documents
tokenAssemblerStem = TokenAssembler()\
    .setInputCols(["document", "stemToken"])\
    .setOutputCol("assembledStem")
# Assembles lemmatized tokens back into documents
tokenAssemblerLemma = TokenAssembler()\
    .setInputCols(["document", "lemmaToken"])\
    .setOutputCol("assembledLemma")
# Finisher helps us to bring back the expected structure array of the tokens
# NOTE(review): defined but not included in either pipeline's stages below -- confirm
finisher = Finisher()\
    .setInputCols(["assembledLemma",
                   "lemmaToken"])\
    .setOutputCols(["assembledLemma",
                    "lemmaToken"])\
    .setCleanAnnotations(True)\
    .setIncludeMetadata(False)\
    .setOutputAsArray(True)
# Organizes the pipeline stages; the two pipelines differ only in which
# document assembler (JobTitle vs JobDescription) feeds them
processedPipelineJobTitle = Pipeline()\
    .setStages([documentAssemblerJobTitle,
                sentencerDL,
                tokenizer,
                normalizer,
                stopwordsCleaner,
                spellChecker,
                stemmer,
                lemmatizer,
                tokenAssemblerStem,
                tokenAssemblerLemma])
processedPipelineJobDescription = Pipeline()\
    .setStages([documentAssemblerJobDescription,
                sentencerDL,
                tokenizer,
                normalizer,
                stopwordsCleaner,
                spellChecker,
                stemmer,
                lemmatizer,
                tokenAssemblerStem,
                tokenAssemblerLemma])
# Fitting the pipelines and transforming the same source dataframe with each
processedSparkJobTitle = processedPipelineJobTitle.fit(spark_df).transform(spark_df)
processedSparkJobDescription = processedPipelineJobDescription.fit(spark_df).transform(spark_df)
processedSparkJobDescription.printSchema()
# extracting the lemmatized and stemmed tokens from the spark dataframe
# (archived: results were computed once and saved to GDrive; see reads below)
# processed_JobTitle = processedSparkJobTitle.select(F.explode(F.arrays_zip('document.result',
# 'assembledLemma.result',
# 'assembledStem.result')).alias("cols")) \
# .select(F.expr("cols['0']").alias("JobTitle"),
# F.expr("cols['1']").alias("JobTitle_lemmas"),
# F.expr("cols['2']").alias("JobTitle_stems")).toPandas()
# processed_JobTitle.head()
# extracting the lemmatized and stemmed tokens from the spark dataframe
# processed_JobDescription = processedSparkJobDescription.select(F.explode(F.arrays_zip('document.result',
# 'assembledLemma.result',
# 'assembledStem.result')).alias("cols")) \
# .select(F.expr("cols['0']").alias("JobDescription"),
# F.expr("cols['1']").alias("JobDescription_lemmas"),
# F.expr("cols['2']").alias("JobDescription_stems")).toPandas()
# processed_JobDescription.head()
# merging the processed JobTitle and JobDescription data frames
# processed_text = pd.merge(processed_JobTitle,
# processed_JobDescription,
# left_index = True,
# right_index = True)
# saving the processed text columns to GDrive
# processed_text.to_csv(root_path + '/data/processed_text.csv')
# loading the processed text columns from GDrive (precomputed checkpoint)
processed_text = pd.read_csv(root_path + '/data/processed_text.csv')
processed_text.head()
# extracting only the lemmas
# JobTitle_lemmas = processedSparkJobTitle.select(F.explode(F.arrays_zip('lemmaToken.result')).alias("cols")) \
# .select(F.expr("cols['0']").alias("JobTitle_lemmas")).toPandas()
# JobDescription_lemmas = processedSparkJobDescription.select(F.explode(F.arrays_zip('lemmaToken.result')).alias("cols")) \
# .select(F.expr("cols['0']").alias("JobDescription_lemmas")).toPandas()
# # extracting only the stems
# JobTitle_stems = processedSparkJobTitle.select(F.explode(F.arrays_zip('stemToken.result')).alias("cols")) \
# .select(F.expr("cols['0']").alias("JobTitle_stems")).toPandas()
# JobDescription_stems = processedSparkJobDescription.select(F.explode(F.arrays_zip('stemToken.result')).alias("cols")) \
# .select(F.expr("cols['0']").alias("JobDescription_stems")).toPandas()
# saving the lemmas
# JobTitle_lemmas.to_csv(root_path + '/data/JobTitle_lemmas.csv')
# JobDescription_lemmas.to_csv(root_path + '/data/JobDescription_lemmas.csv')
# saving the stems
# JobTitle_stems.to_csv(root_path + '/data/JobTitle_stems.csv')
# JobDescription_stems.to_csv(root_path + '/data/JobDescription_stems.csv')
# reading the lemmas and stems (precomputed checkpoints)
JobTitle_lemmas = pd.read_csv(root_path + '/data/JobTitle_lemmas.csv')
JobDescription_lemmas = pd.read_csv(root_path + '/data/JobDescription_lemmas.csv')
JobTitle_stems = pd.read_csv(root_path + '/data/JobTitle_stems.csv')
JobDescription_stems = pd.read_csv(root_path + '/data/JobDescription_stems.csv')
# split data into features and target variable
y = data['JobType']
X = data.drop('JobType', axis = 1)
# split data into training and test sets
# NOTE(review): these splits are not used by the visible downstream code and
# X/y are rebound later for the neural-network section -- confirm intent
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = 1)
# looking at the salary distribution of each job type
salary_plot = sns.displot(x='SalaryAvg', data=data, hue='JobType', kde=True, aspect=1.5)
# saving the figure
salary_fig = salary_plot.fig
salary_fig.savefig(output_path + 'salary_hist_kf.png')
# plotting the distribution of rating
rating_plot = sns.displot(x='Rating', data=data, hue='JobType', kde=True, aspect=1.5)
# saving the figure
rating_fig = rating_plot.fig
rating_fig.savefig(output_path + 'rating_hist_kf.png')
# plotting the company-size distribution per job type
size_plot = sns.countplot(x='Size', hue='JobType', saturation=0.5, data=data)
# saving the figure (countplot returns an Axes, hence get_figure())
size_fig = size_plot.get_figure()
size_fig.savefig(output_path + 'size_bar_kf.png')
# taking a look at the correlations between certain features
# (JobType is label-encoded 0/1/2 just for the correlation computation)
data[['Rating', 'SalaryAvg', 'OrganizationAge', 'Size', 'JobType']].replace({'JobType': {'Data Analyst': 0, 'Business Analyst': 1, 'Data Scientist': 2}}).corr()
# creating data scientist mask
ds_mask = data['JobType'] == 'Data Scientist'
# Join the different lemmas together into one big text for the word cloud
# NOTE(review): NaN lemma rows are only dropped further below (before the
# Data Analyst cloud); if any NaN is present here this join would fail --
# presumably the notebook cells were executed in a different order; confirm
ds_lemmas = " ".join(list(processed_text[ds_mask]['JobDescription_lemmas'].values))
# Create a WordCloud object
wordcloud = WordCloud(background_color = "white",
                      max_words = 5000,
                      contour_width = 3,
                      contour_color = 'steelblue')
# Generate a word cloud
wordcloud.generate(ds_lemmas)
# Save the word cloud
wordcloud.to_file(output_path + "/ds_wordcloud.png")
# Visualize the word cloud
wordcloud.to_image()
# remove the rows that have NA as JobDescription
processed_text['JobType'] = data['JobType']
processed_text.drop(processed_text[processed_text['JobDescription_lemmas'].isna()].index, inplace = True)
# creating data analyst mask
da_mask = data['JobType'] == 'Data Analyst'
# Join the different lemmas together
da_lemmas = " ".join(list(processed_text[da_mask]['JobDescription_lemmas'].values))
# Create a WordCloud object
wordcloud = WordCloud(background_color = "white",
                      max_words = 5000,
                      contour_width = 3,
                      contour_color = 'steelblue')
# Generate a word cloud
wordcloud.generate(da_lemmas)
# Save the word cloud
wordcloud.to_file(output_path + "/da_wordcloud.png")
# Visualize the word cloud
wordcloud.to_image()
# creating business analyst mask
ba_mask = data['JobType'] == 'Business Analyst'
# Join the different lemmas together
ba_lemmas = " ".join(list(processed_text[ba_mask]['JobDescription_lemmas'].values))
# Create a WordCloud object
wordcloud = WordCloud(background_color = "white",
                      max_words = 5000,
                      contour_width = 3,
                      contour_color = 'steelblue')
# Generate a word cloud
wordcloud.generate(ba_lemmas)
# Save the word cloud
wordcloud.to_file(output_path + "/ba_wordcloud.png")
# Visualize the word cloud
wordcloud.to_image()
#NMF: helper to run NMF and print the top terms of each topic
def nmf_function(num_components, doc_text_matrix, vectorizer):
    """Fit an NMF topic model and print the 15 highest-weighted terms per topic.

    Parameters
    ----------
    num_components : int -- number of topics to extract
    doc_text_matrix : array-like -- document-term matrix (e.g. TF-IDF output)
    vectorizer : fitted vectorizer providing get_feature_names()

    Returns None; output is printed.
    """
    nmf = NMF(num_components)
    nmf.fit_transform(doc_text_matrix)
    feature_names = vectorizer.get_feature_names()
    # BUG FIX: the original ended with print(display_topics(...)) but
    # display_topics is defined nowhere in this file, so calling the
    # function raised NameError. Print the top terms per topic directly.
    # (The unused topic_word DataFrame and the manual index-building loop
    # were removed as dead code.)
    for topic_idx, weights in enumerate(nmf.components_):
        top_terms = [feature_names[i] for i in weights.argsort()[::-1][:15]]
        print(f'Topic {topic_idx}: ' + ', '.join(top_terms))
#Running tfidf on job descriptions
# sklearn's built-in English stop words; min_df/max_df trim very rare and
# very common terms from the vocabulary
stop_words = text.ENGLISH_STOP_WORDS
cv_tfidf = TfidfVectorizer(stop_words=stop_words, min_df=0.1, max_df=0.7)
# dense document-term matrix (rows = job postings, columns = vocabulary terms)
x_tfidf = cv_tfidf.fit_transform(data.JobDescription).toarray()
df_tfidf = pd.DataFrame(x_tfidf,columns=cv_tfidf.get_feature_names())
job_titles = data['JobType'].values
#run NMF on output of TFIDF
def nmf_HMatrix(num_components, doc_text_matrix, vectorizer):
    """Fit an NMF model and return the document-topic matrix H.

    Parameters
    ----------
    num_components : int -- number of topics
    doc_text_matrix : array-like -- document-term matrix (e.g. TF-IDF output)
    vectorizer : unused; kept so existing call sites remain valid

    Returns a DataFrame (rounded to 3 decimals) indexed by the module-level
    ``job_titles`` labels, with one column per topic (0..num_components-1).
    """
    nmf = NMF(num_components)
    doc_topic = nmf.fit_transform(doc_text_matrix)
    # idiom fix: range() already yields 0..num_components-1, so the original
    # append loop building the column list was unnecessary
    return pd.DataFrame(doc_topic.round(3),
                        index=job_titles,
                        columns=list(range(num_components)))
# document-topic matrix for 9 topics
h9 = nmf_HMatrix(9,df_tfidf,cv_tfidf)
h9
# refit a 9-topic NMF directly to obtain the topic-term weights
X=x_tfidf
model=NMF(9)
model.fit(X)
nmf_features = model.transform(X)
components_df = pd.DataFrame(model.components_, columns=cv_tfidf.get_feature_names())
#get top words per topic from output of NMF
for topic in range(components_df.shape[0]):
    tmp = components_df.iloc[topic]
    print(f'For topic {topic+1} the words with the highest value are:')
    print(tmp.nlargest(10))
    print('\n')
#running kmeans clustering (k=3, one cluster per job type) on output of NMF
kmeans9 = KMeans(n_clusters=3,random_state=555)
clustering_ori9 = kmeans9.fit_predict(h9)
kmeans9.cluster_centers_
#TSNE on kmeans of NMF output: project the 9-d topic space to 2-d for plotting
labels= kmeans9.predict(h9)
label=["cluster0", "cluster1", "cluster2"]
model=TSNE(learning_rate=100)
Tsne_transformed=model.fit_transform(h9)
xs =Tsne_transformed[:,0]
ys=Tsne_transformed[:,1]
# color each point by its kmeans cluster
scatter=plt.scatter(xs,ys, c=labels, alpha=.6)
handles, _ = scatter.legend_elements(prop='colors')
plt.legend(handles, label)
#comparing outputs of kmeans clusters (on output of NMF on TFIDF) to actual job descriptions:
# cross-tabulate job type (index) against assigned cluster
cluster_comparison = pd.DataFrame(kmeans9.predict(h9), job_titles)
cluster_comparison["cluster"]=cluster_comparison[0]
cluster_comparison = cluster_comparison.drop(0, axis=1)
cluster_comparison.reset_index(inplace=True)
cluster_comparison.groupby(['index', 'cluster']).size()
# Hierarchical Clustering
#hierarchical clustering on output of TFIDF only
labels=list(job_titles)
x = df_tfidf.values
# row-normalize so Ward linkage works on comparable vectors
normalized_x = normalize(x)
plt.figure(figsize=(15,12))
mergings = linkage(normalized_x, method='ward')
dendrogram(mergings,
           labels=labels,
           leaf_rotation=90,
           leaf_font_size=8
           )
plt.show()
#hierarchical clustering on output of NMF on TFIDF (9-topic document-topic matrix)
labels=list(job_titles)
x=h9.values
normalized_x = normalize(x)
plt.figure(figsize=(15,12))
mergings = linkage(normalized_x, method='ward')
dendrogram(mergings,
           labels=labels,
           leaf_rotation=90,
           leaf_font_size=8
           )
plt.show()
# downloading stop words from nltk package
stop_words = stopwords.words('english')
# to account for misspelling of conjunctions, we are adding conjunctions
# without the apostrophes (e.g. "dont" alongside "don't")
stop_words_mispelled = [word.replace("'", '') for word in stop_words]
stop_words = list(set(stop_words + stop_words_mispelled))
# creating corpus using job descriptions with some minor text cleanup
corpus = data['JobDescription'].str.lower().str.replace('[-,;\.:]', '', regex=True)
# creating tfidf vector object to create a feature space
# token pattern is custom to allow single-letter tokens
tfidf_obj = TfidfVectorizer(token_pattern=r'(?u)\b[a-z]+\b', stop_words=stop_words)
# creating truncated svd object to reduce the feature space
# using 100 dimensions according to documentation
svd_obj = TruncatedSVD(n_components=100)
# creating vectorization of corpus
tfidf_corpus = tfidf_obj.fit_transform(corpus)
# reducing dimensions
tfidf_corpus_red = svd_obj.fit_transform(tfidf_corpus)
# creating a list of three job types
# will be used to add points to plots one type at a time
job_types = data['JobType'].unique()
# the figure will have n_rows * n_cols number of subplots
n_rows = 3
n_cols = 2
# instantiated a figure object
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 20))
# initializing perplexity at 0 and will incrementally increase for each subplot
# (so subplots cover perplexity 10, 20, ..., 60)
# learn rate was chosen to be 200 after several attempts at other learning rates
perplexity = 0
learn_rate = 200
# plotting the data for one subplot at a time
for row in range(n_rows):
    for col in range(n_cols):
        # running tsne on the data with the current loop perplexity
        perplexity += 10
        tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=learn_rate)
        tsne_reduced = tsne.fit_transform(tfidf_corpus_red)
        # adding points for one job type at a time
        for j_type in job_types:
            job_mask = data['JobType'] == j_type
            axs[row, col].scatter(tsne_reduced[job_mask, 0],
                                  tsne_reduced[job_mask, 1],
                                  label=j_type,
                                  alpha=0.30,
                                  edgecolors='#000000',
                                  s=40)
        # adding axis labels
        axs[row, col].set_title(f'Perplexity = {perplexity}', size=16, weight='bold')
        axs[row, col].set_xlabel('Component 1', size=14, weight='bold')
        axs[row, col].set_ylabel('Component 2', size=14, weight='bold')
        # adding legend
        axs[row, col].legend(loc='lower right')
plt.tight_layout()
fig.savefig(output_path + f'tsne_learning_rate_{learn_rate}.png', facecolor='white', transparent=False)
# --- LDA topic modeling: all postings combined, then one model per job type ---
tfidf = TfidfVectorizer(max_df=0.9,min_df=2,stop_words='english')
tfidf_fit = tfidf.fit_transform(data['JobDescription'])
# generate 3 topics over the full corpus
lda = LatentDirichletAllocation(n_components=3,random_state=42)
lda_fit = lda.fit(tfidf_fit)
# extracting the keywords in each topic (10 highest-weighted terms)
for id_value, value in enumerate(lda_fit.components_):
    print(f"The topic would be {id_value}")
    print([tfidf.get_feature_names()[index] for index in value.argsort()[-10:]])
    print("\n")
# LDA on Data Scientist postings only
ds_lda = data[data['JobType']=='Data Scientist']['JobDescription']
tfidf = TfidfVectorizer(max_df=0.9,min_df=2,stop_words='english')
ds_tfidf_fit = tfidf.fit_transform(ds_lda)
# generate 3 topics
lda = LatentDirichletAllocation(n_components=3,random_state=42)
ds_lda_fit = lda.fit(ds_tfidf_fit)
# extracting the keywords in each topic
for id_value, value in enumerate(ds_lda_fit.components_):
    print(f"The topic would be {id_value}")
    print([tfidf.get_feature_names()[index] for index in value.argsort()[-10:]])
    print("\n")
# LDA on Data Analyst postings only
da_lda = data[data['JobType']=='Data Analyst']['JobDescription']
tfidf = TfidfVectorizer(max_df=0.9,min_df=2,stop_words='english')
da_tfidf_fit = tfidf.fit_transform(da_lda)
# generate 3 topics
lda = LatentDirichletAllocation(n_components=3,random_state=42)
da_lda_fit = lda.fit(da_tfidf_fit)
# extracting the keywords in each topic
for id_value, value in enumerate(da_lda_fit.components_):
    print(f"The topic would be {id_value}")
    print([tfidf.get_feature_names()[index] for index in value.argsort()[-10:]])
    print("\n")
# LDA on Business Analyst postings only
ba_lda = data[data['JobType']=='Business Analyst']['JobDescription']
tfidf = TfidfVectorizer(max_df=0.9,min_df=2,stop_words='english')
ba_tfidf_fit = tfidf.fit_transform(ba_lda)
# generate 3 topics
lda = LatentDirichletAllocation(n_components=3,random_state=42)
ba_lda_fit = lda.fit(ba_tfidf_fit)
# extracting the keywords in each topic
for id_value, value in enumerate(ba_lda_fit.components_):
    print(f"The topic would be {id_value}")
    print([tfidf.get_feature_names()[index] for index in value.argsort()[-10:]])
    print("\n")
# --- Top2Vec topic modeling on raw (unprocessed) job descriptions ---
# parsing raw JobDescription text
docs_raw = list(data['JobDescription'].values)
docs_raw[:5]
# fitting the Top2Vec model using USE embeddings on the raw job description text
# (archived: model was trained once, saved to GDrive, and is loaded below)
# top2vec_raw = Top2Vec(docs_raw,
# speed = 'deep-learn',
# embedding_model = 'universal-sentence-encoder')
# saving raw Top2Vec model
# top2vec_raw.save(root_path + "/data/top2vec_raw")
# loading raw Top2Vec model
top2vec_raw = Top2Vec.load(root_path + "/data/top2vec_raw")
# getting number of topics
top2vec_raw.get_num_topics()
# getting top 3 topics
topic_words_raw, word_scores_raw, topic_nums_raw = top2vec_raw.get_topics(3)
# plotting word clouds for the top 3 topics
for topic in topic_nums_raw:
    top2vec_raw.generate_topic_wordcloud(topic)
# parsing raw data scientist text
ds_raw = list(data[data['JobType'] == 'Data Scientist']['JobDescription'].values)
ds_raw[:5]
# fitting the Top2Vec model using USE embeddings on the raw data scientist text
# top2vec_ds_raw = Top2Vec(ds_raw,
# speed = 'deep-learn',
# embedding_model = 'universal-sentence-encoder')
# saving raw Top2Vec model
# top2vec_ds_raw.save(root_path + "/data/top2vec_ds_raw")
# loading raw Top2Vec model
top2vec_ds_raw = Top2Vec.load(root_path + "/data/top2vec_ds_raw")
# getting number of topics
top2vec_ds_raw.get_num_topics()
# getting top 3 topics
topic_words_ds_raw, word_scores_ds_raw, topic_nums_ds_raw = top2vec_ds_raw.get_topics(3)
# plotting word clouds for the top 3 topics
for topic in topic_nums_ds_raw:
    top2vec_ds_raw.generate_topic_wordcloud(topic)
# parsing raw data analyst text
da_raw = list(data[data['JobType'] == 'Data Analyst']['JobDescription'].values)
da_raw[:5]
# fitting the Top2Vec model using USE embeddings on the raw data analyst text
# top2vec_da_raw = Top2Vec(da_raw,
# speed = 'deep-learn',
# embedding_model = 'universal-sentence-encoder')
# saving raw Top2Vec model
# top2vec_da_raw.save(root_path + "/data/top2vec_da_raw")
# loading raw Top2Vec model
top2vec_da_raw = Top2Vec.load(root_path + "/data/top2vec_da_raw")
# getting number of topics
top2vec_da_raw.get_num_topics()
# getting top 2 topics
topic_words_da_raw, word_scores_da_raw, topic_nums_da_raw = top2vec_da_raw.get_topics(2)
# plotting word clouds for the top 2 topics
for topic in topic_nums_da_raw:
    top2vec_da_raw.generate_topic_wordcloud(topic)
# parsing raw business analyst text
ba_raw = list(data[data['JobType'] == 'Business Analyst']['JobDescription'].values)
ba_raw[:5]
# fitting the Top2Vec model using USE embeddings on the raw business analyst text
# top2vec_ba_raw = Top2Vec(ba_raw,
# speed = 'deep-learn',
# embedding_model = 'universal-sentence-encoder')
# saving raw Top2Vec model
# top2vec_ba_raw.save(root_path + "/data/top2vec_ba_raw")
# loading raw Top2Vec model
top2vec_ba_raw = Top2Vec.load(root_path + "/data/top2vec_ba_raw")
# getting number of topics
top2vec_ba_raw.get_num_topics()
# getting top 3 topics
topic_words_ba_raw, word_scores_ba_raw, topic_nums_ba_raw = top2vec_ba_raw.get_topics(3)
# plotting word clouds for the top 3 topics
for topic in topic_nums_ba_raw:
    top2vec_ba_raw.generate_topic_wordcloud(topic)
# importing TensorFlow
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from keras.layers import Dropout
from keras.models import Model
# Define model output visualization
def plot_history(history):
    """Plot training vs. validation accuracy and loss in two side-by-side panels.

    ``history`` is the object returned by ``model.fit`` (its ``.history``
    dict must contain 'accuracy', 'val_accuracy', 'loss', 'val_loss').
    """
    hist = history.history
    epochs = range(1, len(hist['accuracy']) + 1)
    # (train key, val key, train label, val label, panel title)
    panels = [
        ('accuracy', 'val_accuracy', 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        ('loss', 'val_loss', 'Training loss', 'Validation loss',
         'Training and validation loss'),
    ]
    plt.figure(figsize=(12, 5))
    for pos, (train_key, val_key, train_lbl, val_lbl, title) in enumerate(panels, 1):
        plt.subplot(1, 2, pos)
        plt.plot(epochs, hist[train_key], 'b', label=train_lbl)
        plt.plot(epochs, hist[val_key], 'r', label=val_lbl)
        plt.title(title)
        plt.legend()
# Create copy dataframe for neural network, encoding and splitting data
# NOTE(review): LabelEncoder is imported from sklearn.preprocessing much
# later in this file (notebook cell ordering) — confirm that cell runs first.
nn_df = data[['JobTitle','JobDescription','JobType','SalaryAvg']]
X = nn_df.drop('JobType',axis=1)
y = nn_df['JobType']
# integer-encode the three job types (0/1/2)
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)
# 80/20 split on job titles; random_state=42 is reused for the other feature
# columns below so all splits stay row-aligned
X_train_title, X_test_title, y_train_true, y_test_true = train_test_split(X['JobTitle'].values, encoded_y, test_size = 0.2, random_state = 42)
# Categorize target variable to be applied in sequential model
# (one-hot encode the 3 classes for categorical_crossentropy)
y_train = tf.keras.utils.to_categorical(y_train_true, 3)
y_test = tf.keras.utils.to_categorical(y_test_true, 3)
# Applied GloVe word embeddings
# We only focus on the words present in the tokenizer vocabulary and skip the rest
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a GloVe text file.

    Parameters
    ----------
    filepath : str
        Path to a GloVe file with one ``word v1 v2 ...`` entry per line.
    word_index : dict
        Mapping of word -> integer index (e.g. Keras ``tokenizer.word_index``).
    embedding_dim : int
        Number of embedding dimensions to keep from each GloVe vector.

    Returns
    -------
    numpy.ndarray of shape ``(len(word_index) + 1, embedding_dim)``.
    Row 0 (reserved padding index) and rows for words absent from the GloVe
    file remain all-zero.
    """
    vocab_size = len(word_index) + 1  # Adding 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    # GloVe files are UTF-8; be explicit so this also works on platforms
    # whose default locale encoding is not UTF-8 (e.g. Windows).
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                # truncate to embedding_dim in case the file stores longer vectors
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix
# tokenize the job titles: keep only the 50 most frequent words
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=50)
tokenizer.fit_on_texts(X_train_title)
X_train_title = tokenizer.texts_to_sequences(X_train_title)
X_test_title = tokenizer.texts_to_sequences(X_test_title)
# pad/truncate every title sequence to a fixed length of 50 (zeros appended)
maxlen_title = 50
X_train_title = pad_sequences(X_train_title, padding='post', maxlen=maxlen_title)
X_test_title = pad_sequences(X_test_title, padding='post', maxlen=maxlen_title)
# build the 100-d GloVe embedding matrix for the title vocabulary
embedding_dim = 100
embedding_matrix_title = create_embedding_matrix('/content/drive/Shareddrives/Data Mining Group Project/data/glove.6B.100d.txt',tokenizer.word_index, embedding_dim)
# fraction of vocabulary words that received a (nonzero) GloVe vector
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix_title, axis=1))
vocab_size_title = len(tokenizer.word_index) + 1
nonzero_elements / vocab_size_title
# Sequential model on job titles: GloVe-initialized (trainable) embedding ->
# global max pool -> dense(25, relu) -> dropout -> softmax over 3 classes
model = Sequential()
model.add(layers.Embedding(vocab_size_title, embedding_dim,
                           weights=[embedding_matrix_title],
                           input_length=maxlen_title,
                           trainable=True))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(25, activation='relu'))
model.add(Dropout(0.2))
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()
# train for 20 epochs, validating on the held-out test split
history = model.fit(X_train_title, y_train,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test_title, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train_title, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_title, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history(history)
# Check classification report for job title (argmax converts the softmax
# probabilities back to integer class labels)
# NOTE(review): classification_report is imported from sklearn later in this
# file (notebook cell ordering) — confirm that cell runs first.
result_train = model.predict(X_train_title)
pr_train = [np.argmax(x) for x in result_train]
result_test = model.predict(X_test_title)
pr_test = [np.argmax(x) for x in result_test]
print(classification_report(y_train_true, pr_train))
print(classification_report(y_test_true, pr_test))
# With the same random state value, we don't need to create new target
# variable again (rows align with the earlier title/target split)
X_train, X_test = train_test_split(X['JobDescription'].values, test_size = 0.2, random_state = 42)
# Tokenize words based on word values from the dictionary tokenizer.word_index
# (descriptions are longer than titles, so keep the top 5000 words)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
# After processing above, we have text sequences that in most cases different length of words. Handle the length issue as below: pad sequence of words with zeros
maxlen = 800
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
# Retrieve the embedding matrix
embedding_dim = 100
embedding_matrix = create_embedding_matrix('/content/drive/Shareddrives/Data Mining Group Project/data/glove.6B.100d.txt',tokenizer.word_index, embedding_dim)
# Check how many embedding vectors are nonzero
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
vocab_size = len(tokenizer.word_index) + 1
nonzero_elements / vocab_size
# Initialize sequential model with pretrained weights from GloVe
# (same architecture as the title model but with a deeper dense head)
maxlen = 800
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim,
                           weights=[embedding_matrix],
                           input_length=maxlen,
                           trainable=True))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(50, activation='relu'))
model.add(Dropout(0.2))
model.add(layers.Dense(25, activation='relu'))
model.add(Dropout(0.2))
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()
# Fit the model with epochs as 20 and plot history performance of the model
history = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history(history)
# Get prediction from the model (argmax of the softmax output per row)
result_train = model.predict(X_train)
pr_train = [np.argmax(x) for x in result_train]
result_test = model.predict(X_test)
pr_test = [np.argmax(x) for x in result_test]
# Check classification results
print(classification_report(y_train_true, pr_train))
print(classification_report(y_test_true, pr_test))
# Define model output visualization for the salary-augmented models
def plot_history_salary(history):
    """Plot training/validation accuracy and loss, skipping the first epoch.

    The first epoch is dropped from every curve (``[1:]`` below), so the
    x axis must start at epoch 2 — the original started at 1, mislabeling
    every remaining point by one epoch.
    """
    acc = history.history['accuracy'][1:]
    val_acc = history.history['val_accuracy'][1:]
    loss = history.history['loss'][1:]
    val_loss = history.history['val_loss'][1:]
    # curves begin at epoch 2 after slicing off epoch 1
    x = range(2, len(acc) + 2)
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
# With the same random state value, combine SalaryAvg to the model
# (random_state=42 matches the earlier splits, so rows stay aligned with
# X_train_title / y_train)
X_train2, X_test2, = train_test_split(X['SalaryAvg'].values, test_size = 0.2, random_state = 42)
# Functional-API model with two inputs: the padded title sequence and the
# scalar average salary
input_1 = layers.Input(shape=(maxlen_title,))
input_2 = layers.Input(shape=(1,))
embedding_layer = layers.Embedding(vocab_size_title, embedding_dim,
                                   weights=[embedding_matrix_title],
                                   trainable=True)(input_1)
globalpool = layers.GlobalMaxPool1D()(embedding_layer)
# small dense branch for the salary input
dense_layer_1 = layers.Dense(5, activation='relu')(input_2)
# dense_layer_2 = Dense(10, activation='relu')(dense_layer_1)
# merge the text and salary branches, then classify
concat_layer = layers.Concatenate()([globalpool, dense_layer_1])
dense_layer_3 = layers.Dense(50, activation='relu')(concat_layer)
dropout_1 = layers.Dropout(0.2)(dense_layer_3)
dense_layer_4 = layers.Dense(25, activation='relu')(dropout_1)
dropout_2 = layers.Dropout(0.2)(dense_layer_4)
output = layers.Dense(3, activation='softmax')(dropout_2)
model = Model(inputs=[input_1, input_2], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
history = model.fit(x=[X_train_title,X_train2], y=y_train,
                    epochs=20,
                    verbose=False,
                    validation_data=([X_test_title,X_test2], y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(x=[X_train_title,X_train2], y=y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x=[X_test_title,X_test2], y=y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history_salary(history)
# per-class metrics on the train and test predictions
result_train = model.predict([X_train_title,X_train2])
pr_train = [np.argmax(x) for x in result_train]
result_test = model.predict([X_test_title,X_test2])
pr_test = [np.argmax(x) for x in result_test]
print(classification_report(y_train_true, pr_train))
print(classification_report(y_test_true, pr_test))
# Same two-input functional model, but the text branch now takes the padded
# job DESCRIPTION sequences (length maxlen) instead of the titles
input_1 = layers.Input(shape=(maxlen,))
input_2 = layers.Input(shape=(1,))
embedding_layer = layers.Embedding(vocab_size, embedding_dim,
                                   weights=[embedding_matrix],
                                   trainable=True)(input_1)
globalpool = layers.GlobalMaxPool1D()(embedding_layer)
# small dense branch for the salary input
dense_layer_1 = layers.Dense(5, activation='relu')(input_2)
# dense_layer_2 = Dense(10, activation='relu')(dense_layer_1)
concat_layer = layers.Concatenate()([globalpool, dense_layer_1])
dense_layer_3 = layers.Dense(50, activation='relu')(concat_layer)
dropout_1 = layers.Dropout(0.2)(dense_layer_3)
dense_layer_4 = layers.Dense(25, activation='relu')(dropout_1)
dropout_2 = layers.Dropout(0.2)(dense_layer_4)
output = layers.Dense(3, activation='softmax')(dropout_2)
model = Model(inputs=[input_1, input_2], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# verbose=1 here (unlike the other fits) so training progress is shown
history = model.fit(x=[X_train,X_train2], y=y_train,
                    epochs=20,
                    verbose=1,
                    validation_data=([X_test,X_test2], y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(x=[X_train,X_train2], y=y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(x=[X_test,X_test2], y=y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history_salary(history)
# per-class metrics on the train and test predictions
result_train = model.predict([X_train,X_train2])
pr_train = [np.argmax(x) for x in result_train]
result_test = model.predict([X_test,X_test2])
pr_test = [np.argmax(x) for x in result_test]
print(classification_report(y_train_true, pr_train))
print(classification_report(y_test_true, pr_test))
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import TruncatedSVD, NMF, PCA, LatentDirichletAllocation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, DBSCAN
from sklearn.manifold import TSNE
from sklearn.feature_extraction import text
from sklearn.preprocessing import normalize, LabelEncoder
from nltk.stem.snowball import SnowballStemmer
import pickle
# Base - SVM
# Pipeline on raw job titles: token counts -> TF-IDF -> linear SVM
# (hinge-loss SGDClassifier).
# NOTE(review): X_train is indexed with ['JobTitle'] here, i.e. a DataFrame,
# but the Keras section above rebound X_train to a padded numpy array —
# presumably a fresh DataFrame split cell runs before this one; confirm.
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge',
                                                   penalty='l2',
                                                   alpha=1e-3,
                                                   random_state=1))])
text_clf_svm = text_clf_svm.fit(X_train['JobTitle'], y_train)
predicted_svm = text_clf_svm.predict(X_test['JobTitle'])
# untuned accuracy on the test split
np.mean(predicted_svm == y_test)
# Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning.
# All the parameters name start with the classifier name (remember the arbitrary name we gave).
# E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.
# NOTE(review): `parameters` is never used below, and its 'clf__alpha' key
# would not match this pipeline's 'clf-svm' step name anyway — dead code.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3)}
# Similarly doing grid search for SVM: n-gram range, IDF on/off, SVM alpha
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf-svm__alpha': (1e-2, 1e-3)}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train['JobTitle'], y_train)
# cache the fitted grid search so it does not have to be re-run
pickle.dump(gs_clf_svm,
            open(root_path + '/data/jt_svm_base.sav', 'wb'))
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)
# NLTK - SVM
# Removing stop words: same pipeline as above, but the CountVectorizer drops
# common English stop words.
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words = 'english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge',
                                                   penalty='l2',
                                                   alpha=1e-3,
                                                   random_state=1))])
text_clf_svm = text_clf_svm.fit(X_train['JobTitle'], y_train)
predicted_svm = text_clf_svm.predict(X_test['JobTitle'])
# untuned accuracy on the test split
np.mean(predicted_svm == y_test)
# grid search over the same hyperparameters, then persist the result
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf-svm__alpha': (1e-2, 1e-3)}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train['JobTitle'], y_train)
pickle.dump(gs_clf_svm,
            open(root_path + '/data/jt_svm_stopwords.sav', 'wb'))
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)
# A CountVectorizer whose analyzer stems every token with the module-level
# Snowball `stemmer`. This class definition must be executed even when the
# fitted model is only loaded from disk, because unpickling needs it.
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        """Wrap the inherited analyzer so each produced token is stemmed."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # note: `stemmer` is looked up at call time from module scope
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
# Stemming Code - SVM
# Same pipeline, but tokens are stemmed (and English stop words removed) via
# the StemmedCountVectorizer defined above.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
text_svm_stemmed = Pipeline([('vect', stemmed_count_vect),
                             ('tfidf', TfidfTransformer()),
                             ('clf-svm', SGDClassifier(loss='hinge',
                                                       penalty='l2',
                                                       alpha=1e-3,
                                                       random_state=1))])
text_svm_stemmed = text_svm_stemmed.fit(X_train['JobTitle'], y_train)
predicted_svm_stemmed = text_svm_stemmed.predict(X_test['JobTitle'])
# untuned accuracy on the test split
np.mean(predicted_svm_stemmed == y_test)
# grid search over the same hyperparameters, then persist the result
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf-svm__alpha': (1e-2, 1e-3)}
gs_clf_svm = GridSearchCV(text_svm_stemmed, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train['JobTitle'], y_train)
pickle.dump(gs_clf_svm,
            open(root_path + '/data/jt_svm_stemmed.sav', 'wb'))
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)
# loading the three pickled title classifiers back from disk
jt_svm_base = pickle.load(open(root_path + '/data/jt_svm_base.sav', 'rb'))
jt_svm_stopwords = pickle.load(open(root_path + '/data/jt_svm_stopwords.sav', 'rb'))
jt_svm_stemmed = pickle.load(open(root_path + '/data/jt_svm_stemmed.sav', 'rb'))
# base SVM classification report on the testing data
print(classification_report(y_test, jt_svm_base.predict(X_test['JobTitle'])))
print(confusion_matrix(y_test, jt_svm_base.predict(X_test['JobTitle'])))
# stemmed SVM classification report on the testing data
print(classification_report(y_test, jt_svm_stemmed.predict(X_test['JobTitle'])))
print(confusion_matrix(y_test, jt_svm_stemmed.predict(X_test['JobTitle'])))
# removed stopwords SVM classification report on the testing data
print(classification_report(y_test, jt_svm_stopwords.predict(X_test['JobTitle'])))
print(confusion_matrix(y_test, jt_svm_stopwords.predict(X_test['JobTitle'])))
# Base - SVM
# we first define a pipeline that takes as input the raw JobDescription text and
# applies a CountVectorizer, a TF-IDF transformation, and a SVM, sequentially.
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss = 'hinge',
                                                   penalty = 'l2',
                                                   alpha = 1e-3,
                                                   random_state = 1))])
# the pipeline is fitted on the train data and an untuned accuracy is printed.
text_clf_svm = text_clf_svm.fit(X_train['JobDescription'], y_train)
predicted_svm = text_clf_svm.predict(X_test['JobDescription'])
print(np.mean(predicted_svm == y_test))
# we then define a grid of parameters for the CountVectorizer, TF-IDF, and SVM
# to perform a grid search for hyperparameter tuning and calculate the best
# results based on the training dataset
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf-svm__alpha': (1e-2, 1e-3)}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs = -1)
gs_clf_svm = gs_clf_svm.fit(X_train['JobDescription'], y_train)
# we pickle the model to avoid having to run it again
pickle.dump(gs_clf_svm,
            open(root_path + '/data/jd_svm_base.sav', 'wb'))
# these are the best scores and parameters that resulted from the grid search
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)
# NLTK - SVM
# Removing stop words. We perform the exact same pipeline, but in this case
# we remove the common english stop words.
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words = 'english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge',
                                                   penalty='l2',
                                                   alpha=1e-3,
                                                   random_state=1))])
text_clf_svm = text_clf_svm.fit(X_train['JobDescription'], y_train)
predicted_svm = text_clf_svm.predict(X_test['JobDescription'])
# untuned accuracy on the test split
print(np.mean(predicted_svm == y_test))
# grid search over the same hyperparameters, then persist the result
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf-svm__alpha': (1e-2, 1e-3)}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train['JobDescription'], y_train)
pickle.dump(gs_clf_svm,
            open(root_path + '/data/jd_svm_stopwords.sav', 'wb'))
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)
# Stemming Code - SVM
# in this final model, we remove stopwords and we stem the words to their roots
# using the previously defined class
stemmer = SnowballStemmer("english", ignore_stopwords=True)
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
text_svm_stemmed = Pipeline([('vect', stemmed_count_vect),
                             ('tfidf', TfidfTransformer()),
                             ('clf-svm', SGDClassifier(loss='hinge',
                                                       penalty='l2',
                                                       alpha=1e-3,
                                                       random_state=1))])
text_svm_stemmed = text_svm_stemmed.fit(X_train['JobDescription'], y_train)
predicted_svm_stemmed = text_svm_stemmed.predict(X_test['JobDescription'])
# untuned accuracy on the test split
print(np.mean(predicted_svm_stemmed == y_test))
# grid search over the same hyperparameters, then persist the result
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf-svm__alpha': (1e-2, 1e-3)}
gs_clf_svm = GridSearchCV(text_svm_stemmed, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train['JobDescription'], y_train)
pickle.dump(gs_clf_svm,
            open(root_path + '/data/jd_svm_stemmed.sav', 'wb'))
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)
# loading the three pickled description classifiers back from disk
jd_svm_base = pickle.load(open(root_path + '/data/jd_svm_base.sav', 'rb'))
jd_svm_stopwords = pickle.load(open(root_path + '/data/jd_svm_stopwords.sav', 'rb'))
jd_svm_stemmed = pickle.load(open(root_path + '/data/jd_svm_stemmed.sav', 'rb'))
# base SVM classification report on the testing data
print(classification_report(y_test, jd_svm_base.predict(X_test['JobDescription'])))
print(confusion_matrix(y_test, jd_svm_base.predict(X_test['JobDescription'])))
# stemmed SVM classification report on the testing data
print(classification_report(y_test, jd_svm_stemmed.predict(X_test['JobDescription'])))
print(confusion_matrix(y_test, jd_svm_stemmed.predict(X_test['JobDescription'])))
# removed stopwords SVM classification report on the testing data
print(classification_report(y_test, jd_svm_stopwords.predict(X_test['JobDescription'])))
print(confusion_matrix(y_test, jd_svm_stopwords.predict(X_test['JobDescription'])))